#!/usr/bin/env perl

package dicelister;

use strict;
use warnings;
use autodie;

# UTF-8 support
use utf8;
# To use decode_utf8
require Encode;
# Allow wide character outputs without warning
binmode STDOUT, ":utf8";
# To enable the use of unicode throughout perl functions
use feature 'unicode_strings';
# Anything that opens a filehandle is to assume that that stream is
# encoded in UTF‑8 unless you tell it otherwise
use open qw( :encoding(UTF-8) :std );
# To convert base of a number
use Math::Base::Convert;

# To handle CLI parameters
use Getopt::Long;

# For array_contains subroutine
use List::Util qw(any);
# For fetching remote URL
use HTTP::Tiny;

use File::Basename;
use Cwd 'abs_path';
our $dir = abs_path(dirname($0));
our $data_dir = "${dir}/data";

# To extract HTML
use Mojo::DOM;

# For SHA1 hash
use Digest::SHA qw(sha1_hex);

# Mode
# 1 = Extract mode
# 2 = Index mode
our $mode = 1;

## Default values
# Name used mainly for config and output file names
our $config_name = 'default';
# To hold filename of rough list text file
our $rough_list_file;
# To hold filename of indexed list text file
our $indexed_list_file;
# Minimum length of word allowed to be in list
our $minimum_word_length = 3;
# Maximum length of word allowed to be in list
our $maximum_word_length = 14;
# Excluded strings regex
our $excluded_strings_re;
# Minimum number of dots the dice can get
our $dice_face_min = 1;
# Maximum number of dots the dice can get
our $dice_face_max = 6;
# Diceware index length
our $index_length = 5;
# Cache dir for config
our $cache_dir;

our @unique_words;

our @data_sources = (
);

our @exclude_strings = (
	'.',
	',',
	'!',
	'?',
	';',
	':',
	'(',
	')',
	'[',
	']',
	'{',
	'}',
	'<',
	'>',
	'/',
	'\\',
	'+',
	'-',
	'_',
	'"',
	"'",
	'‘',
	'’',
	'″',
	'°',
	'•',
	'→',
	'←',
	'–',
	'#',
	'&',
	'$',
	'£',
	'%',
	'®',
	'©',
);

sub help_text {
	print("usage: dicelister.pl [-h] [-c CONFIG_FILE] [-e] [-i]

Generates a Diceware wordlist based on config.

optional arguments:
  -h, --help            show this help message and exit
  -c CONFIG_FILE, --config CONFIG_FILE
                        config file (without extension)
  -e, --extract
                        set extract mode
  -i, --index
                        set index mode
");
	exit;
}

# Process CLI parameters and update config values as necessary
GetOptions ("c|config=s"    => \$config_name,
			"e|extract"     => sub { $mode = 1 },
			"i|index"       => sub { $mode = 2 },
			"h|help"        => \&help_text)
or die("Error in command line arguments. Please review and try again.\n");

require "${dir}/config/${config_name}.pl" or die("Error: ${dir}/config/${config_name}.pl is not found");

# Hook: Runs before rough file preparation process is started
if (eval "defined(&after_config_import)") {
	after_config_import();
}

# Variable values based on config name
$rough_list_file = "${config_name}.rough.txt";
$indexed_list_file = "${config_name}.wordlist.txt";
$cache_dir = "${data_dir}/${config_name}/_cache";
unless ( -d "${data_dir}/${config_name}" ) {
	mkdir "${data_dir}/${config_name}";
}

if ( $mode == 1 ) {

	# Hook: Runs before rough file preparation process is started
	if (eval "defined(&before_rough_file_process)") {
		before_rough_file_process();
	}

	## Prepare for regex ##

	# Declare the subroutine if it hasn't been overriden
	unless (eval "defined(&escape_regex)") {
		# Escapes characters in regex pattern.
		# Does not handle characters inside character classes separately.
		# Ref: https://stackoverflow.com/a/400316
		sub escape_regex {
			my $pattern = shift;
			if ( defined $pattern ) {
				$pattern =~ s/(\.|\^|\$|\*|\+|\?|\(|\)|\[|\{|\\|\|)/\\$1/ig;
				return $pattern;
			}
		}
	}
	escape_regex();

	# Prepare the regex string for removal of excluded strings
	unless (eval "defined(&prepare_exclude_regex)") {
		sub prepare_exclude_regex {
			foreach ( @exclude_strings ) {
				$_ = escape_regex($_);
			}
			$excluded_strings_re = join('|', @exclude_strings);
		}
	}
	prepare_exclude_regex();

	## Get source content ##

	unless (eval "defined(&check_sources)") {
		sub check_sources {
			if ( scalar @data_sources < 1 ) {
				die("No sources are defined. Please add some sources in \@data_sources array on your config file.");
			}
		}
	}
	check_sources();

	# Get source content text
	our $text = '';

	unless (eval "defined(&get_source_file)") {
		sub get_source_file {
			my $filepath = shift;
			if ( defined $filepath ) {
				open( my $file, "<", $filepath ) or die("File ${filepath} not found");
				my $file_content = do { local $/; <$file> };
				return "$file_content";
			}
		}
	}

	# Requires: HTTP::Tiny
	unless (eval "defined(&get_source_http)") {
		sub get_source_http {
			my $url = shift;
			my $html;
			# Write to cache
			unless ( -d $cache_dir ) {
				mkdir $cache_dir;
			}
			# substr call is to strip an extra \n at the end
			my $cache_file = substr join('', $cache_dir, '/', sha1_hex($url)), 0, -1;
			# If cache file is not present write HTML to cache
			unless ( -e $cache_file ) {
				my $response = HTTP::Tiny->new->get($url);
				if ( $response->{success} ) {
					open(my $CWF, '>', $cache_file) or print "Can't write cache file. $!\n";
					print $CWF $response->{content};
					close($CWF);
					$html = $response->{content};
				} else {
					print "Failed to fetch ${url} ...\n";
				}
			# If cache is present, use it
			} else {
				print "Found cache... using it instead of fetching...\n";
				open my $CRF, '<', $cache_file or die "Can't open cache file $!";
				$html = do { local $/; <$CRF> };
			}
			# Process
			my $dom = Mojo::DOM->new( $html );
			my $html_text = $dom->find('p')->map('text')->join("\n");
			return Encode::decode_utf8($html_text);
		}
	}

	# Get source content for each data source
	unless (eval "defined(&get_source_contents)") {
		sub get_source_contents {
			foreach (@data_sources) {
				# Local file
				if ( $_ =~ /^file\:\/\/(.*)/i ) {
					print "Processing $1 ...\n";
					my $file_content = get_source_file("$1");
					$text = "$text $file_content";
				# HTTP(S) URL
				} elsif ( $_ =~ /^https*\:\/\//i ) {
					print "Processing $_ ...\n";
					$text = "$text " . get_source_http("$_");
				# String
				} else {
					print join("", "Processing string '", (substr $_, 0, 20) ,"...' ...\n");
					$text = "$text $_";
				}
			}
		}
	}
	get_source_contents();

	# Replace all excluded strings to spaces so that it's easier to process
	$text =~ s/$excluded_strings_re/ /g;

	## Process words ##

	our @words = split(' ', $text);

	unless (eval "defined(&is_numeric)") {
		sub is_numeric {
			my $val = shift;
			if ( defined $val ) {
				return $val =~ /^\d+$/ ? 1 : 0;
			} else {
				warn "No argument given to is_numeric!";
			}
		}
	}

	# Requires: use List::Util qw(any);
	unless (eval "defined(&array_contains)") {
		sub array_contains {
			my $value = shift;
			# https://stackoverflow.com/a/16690762
			return any { $_ eq $value } @_;
		}
	}

	# Process word if it should be added to unique words list.
	unless (eval "defined(&check_word)") {
		sub check_word {
			my $word = shift;
			my $word_length = length($word);
			if ( not array_contains($word, @unique_words)
				and $word_length >= $minimum_word_length
				and $word_length <= $maximum_word_length
				and not is_numeric($word)
				) {
				return 1;
			}
			return undef;
		}
	}

	# Prepare unique word list
	unless (eval "defined(&prepare_unique_words)") {
		sub prepare_unique_words {
			# Prepare unique words array
			foreach (@words) {
				my $word = lc("$_");
				if ( check_word($word) ) {
					push @unique_words, $word;
				}
			}
			@unique_words = sort @unique_words;
			my $words_listed = scalar @unique_words;
			my $max_words_needed = $dice_face_max ** $index_length;
			if ( scalar @unique_words < $max_words_needed ) {
				print "WARNING!! There are only ${words_listed} words extracted from the sources in the config file. But there should be a total of ${max_words_needed} words or more. It may cause malfunction in the passphrase generator.\nPlease add more data_sources in config or add more words manually in the rough list.\n";
			}
		}
	}
	prepare_unique_words();

	# Hook: Runs before rough file is written
	if (eval "defined(&before_rough_file_write)") {
		before_rough_file_write();
	}

	unless (eval "defined(&write_rough_list)") {
		sub write_rough_list {
			if (-e -f "${data_dir}/${rough_list_file}") {
				unlink("${data_dir}/${rough_list_file}") or die "Can't unlink ${data_dir}/${rough_list_file}: $!";
			}
			open( my $RF, ">", "${data_dir}/${rough_list_file}") or die "Cannot open ${data_dir}/${rough_list_file} for write";
			foreach (@unique_words) {
				print $RF "$_\n";
			}
			close $RF;
			print "${data_dir}/${rough_list_file} has been generated. Please edit the file if you need to and run the same command with -i to create the wordlist.\n";
		}
	}
	write_rough_list();

	# Hook: Runs after rough file is written
	if (eval "defined(&after_rough_file_write)") {
		after_rough_file_write();
	}

} elsif ( $mode == 2 ) {

	# Hook: Runs before indexing file process starts
	if (eval "defined(&before_indexed_file_process)") {
		before_indexed_file_process();
	}

	# To aid in getting index number
	my $base_enc = [$dice_face_min..$dice_face_max];
	my $index_conv = new Math::Base::Convert('10', $base_enc);

	# Returns Diceware index number for an $nth item.
	# Params:
	#   1: index - has to be 0-based (starts from 0)
	unless (eval "defined(&get_diceware_index_num)") {
		sub get_diceware_index_num {
			my $n = shift;
			my $ind = eval { $index_conv->cnv($n) };
			my $ind_length = length($ind);
			# Fill up empty spaces with 1s (or $dice_face_min)
			if ( $index_length > $ind_length ) {
				$ind = ( $dice_face_min x ($index_length - $ind_length) ) . $ind;
			} else {
				$ind = ( $dice_face_min x ($ind_length - $index_length) ) . $ind;
			}
			return $ind;
		}
	}

	# Read rough file
	my @file_content;
	unless (eval "defined(&read_rough_file_for_indexing)") {
		sub read_rough_file_for_indexing {
			open my $WLR, "${data_dir}/${rough_list_file}" or die "Could not open ${data_dir}/${rough_list_file}: $!";
			while( my $line = <$WLR>)  {   
				push @file_content, $line;
			}
			close $WLR;
		}
	}
	read_rough_file_for_indexing();

	# Sort before putting into wordlist file
	unless (eval "defined(&sort_rough_file_for_indexing)") {
		sub sort_rough_file_for_indexing {
			@file_content = sort @file_content;
		}
	}
	sort_rough_file_for_indexing();

	# Hook: Runs before indexed file is written
	if (eval "defined(&before_indexed_file_write)") {
		before_indexed_file_write();
	}

	# Write wordlist file
	unless (eval "defined(&write_rough_file_for_indexing)") {
		sub write_rough_file_for_indexing {
			open( my $WLW, ">", "${data_dir}/${indexed_list_file}") or die "Cannot open ${data_dir}/${indexed_list_file} for write";
			my $line_index = 0;
			my $max_index = $dice_face_max x $index_length;
			my $idx;
			foreach (@file_content) {
				# Add the index number and put in the file
				$idx = get_diceware_index_num($line_index);
				print $WLW join('', $idx, "\t$_");
				if ( $idx == $max_index ) {
					print "Reached maximum possible dice index number: ${max_index}.\nWill ignore rest of the words.\nLast word was: $_\n";
					last;
				}
				$line_index++;
			}
			if ( $idx != $max_index ) {
				print "WARNING!! The word count haven't reached the maximum possible dice index number ${max_index} but only at ${idx}.\nThis may result in malfunction in the passphrase generator and may not function as expected.\nPlease add more data_sources in config to increase word count.\n";
			}
			print "${data_dir}/${indexed_list_file} file has been created.\n";
			close $WLW;
		}
	}
	write_rough_file_for_indexing();

	# Hook: Runs after indexed file is written
	if (eval "defined(&after_indexed_file_write)") {
		after_indexed_file_write();
	}

}
